{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Naive Bayes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Questão 1\n",
    "\n",
    "Implemente um classificador Naive Bayes para o problema de predizer a qualidade de um carro. Para este fim, utilizaremos um conjunto de dados referente à qualidade de carros, disponível no [UCI](https://archive.ics.uci.edu/ml/datasets/car+evaluation). Este dataset de carros possui as seguintes features e classe:\n",
    "\n",
    "**Atributos**\n",
    "1. buying: vhigh, high, med, low\n",
    "2. maint: vhigh, high, med, low\n",
    "3. doors: 2, 3, 4, 5more\n",
    "4. persons: 2, 4, more\n",
    "5. lug_boot: small, med, big\n",
    "6. safety: low, med, high\n",
    "\n",
    "**Classes**\n",
    "1. unacc, acc, good, vgood"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Modelo 1\n",
    "\n",
    "Mapeando features em numéricos e utilizando distribuição normal"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>count</th>\n",
       "      <th>mean</th>\n",
       "      <th>std</th>\n",
       "      <th>min</th>\n",
       "      <th>25%</th>\n",
       "      <th>50%</th>\n",
       "      <th>75%</th>\n",
       "      <th>max</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>100.0</td>\n",
       "      <td>76.95972</td>\n",
       "      <td>1.571422</td>\n",
       "      <td>72.504378</td>\n",
       "      <td>76.138354</td>\n",
       "      <td>76.707531</td>\n",
       "      <td>77.802102</td>\n",
       "      <td>81.961471</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   count      mean       std        min        25%        50%        75%  \\\n",
       "0  100.0  76.95972  1.571422  72.504378  76.138354  76.707531  77.802102   \n",
       "\n",
       "         max  \n",
       "0  81.961471  "
      ]
     },
     "execution_count": 72,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import csv\n",
    "import math\n",
    "import random\n",
    "from collections import defaultdict\n",
    "\n",
    "# retorna uma função que tenta converter uma string\n",
    "#    em int\n",
    "#    se não conseguir, retorna o valor associado à string\n",
    "#    no map not_int_dict\n",
    "def try_int(not_int_dict):\n",
    "    \n",
    "    def _try_int(x):\n",
    "        \n",
    "        try:\n",
    "            return int(x)\n",
    "        except:\n",
    "            return not_int_dict[x]\n",
    "    return _try_int\n",
    "\n",
    "# lookup tables and per-column converters that map the raw string values to numbers\n",
    "# class label -> integer code, and the inverse table used to decode predictions\n",
    "map_classe_into_int = {'vgood': 4, 'good': 3, 'acc': 2, 'unacc': 1}\n",
    "map_int_into_classe = {v:k for k, v in map_classe_into_int.items()}\n",
    "# one converter per CSV column, in column order (preprocess_row zips them with a row)\n",
    "preprocess_list = [\n",
    "    lambda x: {'vhigh': 4, 'high': 3, 'med': 2, 'low': 1}[x],  # buying\n",
    "    lambda x: {'vhigh': 4, 'high': 3, 'med': 2, 'low': 1}[x],  # maint\n",
    "    try_int({'5more': 6}),  # doors: numeric strings as-is, '5more' encoded as 6\n",
    "    try_int({'more': 5}),  # persons: numeric strings as-is, 'more' encoded as 5\n",
    "    lambda x: {'big': 3, 'med': 2, 'small': 1}[x],  # lug_boot\n",
    "    lambda x: {'high': 3, 'med': 2, 'low': 1}[x],  # safety\n",
    "    lambda x: map_classe_into_int[x]  # class label\n",
    "]\n",
    "\n",
    "def preprocess_row(row):\n",
    "    \"\"\"Convert one row of raw string values into its numeric encoding.\n",
    "\n",
    "    Each value is passed to the converter at the same position in\n",
    "    ``preprocess_list``; extra values on either side are ignored by ``zip``.\n",
    "    \"\"\"\n",
    "    converted = []\n",
    "    for convert, raw_value in zip(preprocess_list, row):\n",
    "        converted.append(convert(raw_value))\n",
    "    return converted\n",
    "\n",
    "def loadCsv(filename):\n",
    "    \"\"\"Load the CSV file at ``filename`` as a list of numerically encoded rows.\n",
    "\n",
    "    The file is opened in a ``with`` block so the handle is always closed.\n",
    "    Blank lines (which ``csv.reader`` yields as empty lists) are skipped,\n",
    "    because an empty vector has no class label in its last position.\n",
    "\n",
    "    Returns a list with one ``preprocess_row`` result per non-empty CSV row.\n",
    "    \"\"\"\n",
    "    with open(filename, \"r\", newline=\"\") as csv_file:\n",
    "        return [preprocess_row(row) for row in csv.reader(csv_file) if row]\n",
    "\n",
    "def splitDataset(dataset, splitRatio):\n",
    "    trainSize = int(len(dataset) * splitRatio)\n",
    "    trainSet = []\n",
    "    copy = list(dataset)\n",
    "    while len(trainSet) < trainSize:\n",
    "        index = random.randrange(len(copy))\n",
    "        trainSet.append(copy.pop(index))\n",
    "    return [trainSet, copy]\n",
    "\n",
    "def separateByClass(dataset):\n",
    "    separated = {}\n",
    "    for i in range(len(dataset)):\n",
    "        vector = dataset[i]\n",
    "        if (vector[-1] not in separated):\n",
    "            separated[vector[-1]] = []\n",
    "        separated[vector[-1]].append(vector)\n",
    "    return separated\n",
    "\n",
    "def mean(numbers):\n",
    "    return sum(numbers)/float(len(numbers))\n",
    " \n",
    "def stdev(numbers):\n",
    "    \"\"\"Sample standard deviation (n - 1 denominator) of ``numbers``.\n",
    "\n",
    "    A single value has no sample variance; 0.0 is returned in that case\n",
    "    instead of dividing by zero, so a class with only one training row does\n",
    "    not abort training (``calculateProbability`` has a branch for a zero\n",
    "    standard deviation).\n",
    "    \"\"\"\n",
    "    if len(numbers) == 1:\n",
    "        return 0.0\n",
    "    avg = mean(numbers)\n",
    "    variance = sum([pow(x-avg,2) for x in numbers])/float(len(numbers)-1)\n",
    "    return math.sqrt(variance)\n",
    "\n",
    "def summarize(dataset):\n",
    "    \"\"\"Return ``(mean, stdev)`` for every column of ``dataset`` except the last.\n",
    "\n",
    "    The statistics are computed for all columns and the entry of the final\n",
    "    column (the class value) is then discarded.\n",
    "    \"\"\"\n",
    "    stats = []\n",
    "    for column in zip(*dataset):\n",
    "        stats.append((mean(column), stdev(column)))\n",
    "    stats.pop()\n",
    "    return stats\n",
    "\n",
    "def summarizeByClass(dataset):\n",
    "    \"\"\"Map each class value to the per-feature ``(mean, stdev)`` of its rows.\"\"\"\n",
    "    return {\n",
    "        classValue: summarize(instances)\n",
    "        for classValue, instances in separateByClass(dataset).items()\n",
    "    }\n",
    "\n",
    "def calculateProbability(x, mean, stdev):\n",
    "    \n",
    "    if stdev == 0:\n",
    "        return 1. if x == mean else .1        \n",
    "    \n",
    "    exponent = math.exp(-(x-mean)**2/(2*stdev**2))\n",
    "    return (1 / ((2*math.pi) * stdev ** 2)**.5) * exponent\n",
    "\n",
    "def calculateClassProbabilities(summaries, inputVector):\n",
    "    \"\"\"Score ``inputVector`` against every class in ``summaries``.\n",
    "\n",
    "    For each class the score is the product, over the summarized features,\n",
    "    of ``calculateProbability(value, mean, stdev)``. No class prior is\n",
    "    applied, so the scores are likelihoods rather than posteriors.\n",
    "\n",
    "    Returns a dict mapping class value to its score.\n",
    "    \"\"\"\n",
    "    scores = {}\n",
    "    for label, feature_stats in summaries.items():\n",
    "        likelihood = 1\n",
    "        for position, (mu, sigma) in enumerate(feature_stats):\n",
    "            likelihood *= calculateProbability(inputVector[position], mu, sigma)\n",
    "        scores[label] = likelihood\n",
    "    return scores\n",
    "\n",
    "def predict(summaries, inputVector):\n",
    "    \"\"\"Return ``(label, score)`` of the best-scoring class for ``inputVector``.\n",
    "\n",
    "    Ties keep the class encountered first; with no classes at all the result\n",
    "    is ``(None, -1)``.\n",
    "    \"\"\"\n",
    "    scores = calculateClassProbabilities(summaries, inputVector)\n",
    "    winner, winner_score = None, -1\n",
    "    for label, score in scores.items():\n",
    "        # skip candidates that do not strictly beat the current winner\n",
    "        if winner is not None and not score > winner_score:\n",
    "            continue\n",
    "        winner, winner_score = label, score\n",
    "    return winner, winner_score\n",
    "\n",
    "def getPredictions(summaries, testSet):\n",
    "    \"\"\"Predict every row of ``testSet``.\n",
    "\n",
    "    Returns two parallel lists: the predicted labels and their scores.\n",
    "    \"\"\"\n",
    "    labels, scores = [], []\n",
    "    for row in testSet:\n",
    "        label, score = predict(summaries, row)\n",
    "        labels.append(label)\n",
    "        scores.append(score)\n",
    "    return labels, scores\n",
    "\n",
    "def getAccuracy(testSet, predictions):\n",
    "    correct = 0\n",
    "    for i in range(len(testSet)):\n",
    "        if testSet[i][-1] == predictions[i]:\n",
    "            correct += 1\n",
    "    return (correct/float(len(testSet))) * 100.0\n",
    "\n",
    "def train_and_test(filepath, runs=100, splitRatio=.67):\n",
    "    \"\"\"Repeat a random hold-out evaluation of the Gaussian model.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    filepath : path of the CSV file to load with ``loadCsv``.\n",
    "    runs : number of independent random train/test splits (default 100).\n",
    "    splitRatio : fraction of the rows used for training (default 0.67).\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    list of float: the test accuracy, in percent, of every run.\n",
    "    \"\"\"\n",
    "    ds = loadCsv(filepath)\n",
    "\n",
    "    accs = []\n",
    "    for _ in range(runs):\n",
    "        train, test = splitDataset(ds, splitRatio=splitRatio)\n",
    "        summaries = summarizeByClass(train)\n",
    "\n",
    "        # only the predicted labels are needed for the accuracy\n",
    "        predictions, _scores = getPredictions(summaries, test)\n",
    "        accs.append(getAccuracy(test, predictions))\n",
    "\n",
    "    return accs\n",
    "\n",
    "# run the repeated random train/test evaluation on the car dataset\n",
    "accs = train_and_test(\"carData.csv\")\n",
    "\n",
    "# summary statistics of the accuracies (in percent) as a one-row table\n",
    "import pandas as pd\n",
    "pd.Series(accs).describe().to_frame().T"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Modelo utilizando frequência dos valores das features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 210,
   "metadata": {},
   "outputs": [],
   "source": [
    "# read the file (no header row, so the columns get integer labels)\n",
    "df = pd.read_csv(\"carData.csv\", header=None)\n",
    "\n",
    "# indices of the feature columns\n",
    "f_cols = list(range(0, 6))\n",
    "\n",
    "# number of records per class\n",
    "freq_classes = df.groupby(6).size()\n",
    "\n",
    "# classes\n",
    "classes = df[6].unique()\n",
    "\n",
    "# probability assigned to feature-value/class combinations\n",
    "#    that have no examples in the dataset\n",
    "DEFAULT_NO_FREQ_PROB = 0.01\n",
    "\n",
    "def fit(df):\n",
    "    \"\"\"Train the frequency-based model on ``df`` and return its predict function.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : pandas.DataFrame\n",
    "        Training rows. The columns listed in ``f_cols`` are the features and\n",
    "        column 6 holds the class label.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    callable\n",
    "        ``predict(ds) -> (probs, predictions)`` where ``probs`` is a list with\n",
    "        one ``{class: score}`` dict per row of ``ds`` and ``predictions`` is\n",
    "        the list of best-scoring classes.\n",
    "    \"\"\"\n",
    "    # Class counts and class labels are taken from the training frame itself.\n",
    "    # Normalising the training counts by counts computed on another frame\n",
    "    # (e.g. the full dataset) gives rows that do not sum to 1 and lets the\n",
    "    # held-out rows influence the model.\n",
    "    train_freq_classes = df.groupby(6).size()\n",
    "    train_classes = df[6].unique()\n",
    "\n",
    "    # probabilities estimated as #(feature value, class) / #class\n",
    "    base_probs = {}\n",
    "\n",
    "    for f_col in f_cols:\n",
    "        # class x feature-value matrix:\n",
    "        #    count rows per (class, value), move the values to the columns,\n",
    "        #    divide each row by its class count and fill the pairs that never\n",
    "        #    occur in the training data with the default probability\n",
    "        base_probs[f_col] = (\n",
    "            df.groupby([6, f_col]).size()\n",
    "            .unstack()\n",
    "            .div(train_freq_classes, axis=0)\n",
    "            .fillna(DEFAULT_NO_FREQ_PROB)\n",
    "        )\n",
    "\n",
    "    def predict(ds):\n",
    "        \"\"\"Score every row of ``ds`` with the probabilities estimated above.\"\"\"\n",
    "        predictions = []\n",
    "        probs = []\n",
    "        for ix, row in ds.iterrows():\n",
    "            prob = {}\n",
    "            for classe in train_classes:\n",
    "                # product of P(feature value | class) over all features\n",
    "                # NOTE(review): the class prior P(class) is not included\n",
    "                prob[classe] = 1.\n",
    "                for f_col in f_cols:\n",
    "                    prob[classe] = prob[classe] * base_probs[f_col].loc[classe, str(row[f_col])]\n",
    "\n",
    "            predictions.append(max(prob.keys(), key=lambda key: prob[key]))\n",
    "            probs.append(prob)\n",
    "\n",
    "        return probs, predictions\n",
    "\n",
    "    return predict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 199,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>count</th>\n",
       "      <th>mean</th>\n",
       "      <th>std</th>\n",
       "      <th>min</th>\n",
       "      <th>25%</th>\n",
       "      <th>50%</th>\n",
       "      <th>75%</th>\n",
       "      <th>max</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>100.0</td>\n",
       "      <td>0.792137</td>\n",
       "      <td>0.023352</td>\n",
       "      <td>0.730298</td>\n",
       "      <td>0.778459</td>\n",
       "      <td>0.794221</td>\n",
       "      <td>0.807793</td>\n",
       "      <td>0.842382</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   count      mean       std       min       25%       50%       75%       max\n",
       "0  100.0  0.792137  0.023352  0.730298  0.778459  0.794221  0.807793  0.842382"
      ]
     },
     "execution_count": 199,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import accuracy_score\n",
    "\n",
    "accs = []\n",
    "\n",
    "# repeat the random hold-out evaluation 100 times\n",
    "for x in range(100):\n",
    "\n",
    "    # split the dataset into train (67%) and test (33%)\n",
    "    df_train, df_test = train_test_split(df, test_size=.33)\n",
    "    \n",
    "    # train\n",
    "    nb = fit(df_train)\n",
    "    # predict\n",
    "    _, predictions = nb(df_test)\n",
    "    \n",
    "    # compute the accuracy\n",
    "    acc = accuracy_score(df_test[6], predictions)\n",
    "    accs.append(acc)\n",
    "\n",
    "# summary statistics of the accuracies as a one-row table\n",
    "pd.Series(accs).describe().to_frame().T"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Questão 2\n",
    "\n",
    "Crie uma versão de sua implementação usando as funções disponíveis na biblioteca SciKitLearn para o Naive Bayes ([veja aqui](http://scikit-learn.org/stable/modules/naive_bayes.html)) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 200,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.naive_bayes import GaussianNB\n",
    "import numpy as np\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn.metrics import confusion_matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 201,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "ds = loadCsv(\"carData.csv\")\n",
    "# keep the numeric encoding under its own name: the model-comparison cell below\n",
    "# passes `df` to the frequency model, which needs the raw string values loaded\n",
    "# with pd.read_csv, so `df` must not be overwritten here\n",
    "df_num = pd.DataFrame(ds, columns=range(7))\n",
    "\n",
    "# features (columns 0-5) and class label (column 6)\n",
    "X = df_num[list(range(6))]\n",
    "y = df_num[6]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 248,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>count</th>\n",
       "      <th>mean</th>\n",
       "      <th>std</th>\n",
       "      <th>min</th>\n",
       "      <th>25%</th>\n",
       "      <th>50%</th>\n",
       "      <th>75%</th>\n",
       "      <th>max</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>100.0</td>\n",
       "      <td>0.769632</td>\n",
       "      <td>0.015746</td>\n",
       "      <td>0.735552</td>\n",
       "      <td>0.757881</td>\n",
       "      <td>0.768827</td>\n",
       "      <td>0.779772</td>\n",
       "      <td>0.803853</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   count      mean       std       min       25%       50%       75%       max\n",
       "0  100.0  0.769632  0.015746  0.735552  0.757881  0.768827  0.779772  0.803853"
      ]
     },
     "execution_count": 248,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "accs = []\n",
    "\n",
    "# repeat the random hold-out evaluation 100 times\n",
    "for x in range(100):\n",
    "\n",
    "    # split the dataset into train (67%) and test (33%)\n",
    "    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33)\n",
    "    \n",
    "    nb = GaussianNB()\n",
    "    # train\n",
    "    nb.fit(X_train, y_train)\n",
    "    \n",
    "    # predict the test rows and compute the accuracy\n",
    "    acc = accuracy_score(y_test, nb.predict(X_test))\n",
    "    accs.append(acc)\n",
    "    \n",
    "# summary statistics of the accuracies as a one-row table\n",
    "pd.Series(accs).describe().to_frame().T"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Questão 3\n",
    "\n",
    "Analise a acurácia dos dois algoritmos e discuta a sua solução."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "Comparando os dois últimos modelos"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 233,
   "metadata": {},
   "outputs": [],
   "source": [
    "# one fixed split (by row position) shared by both models\n",
    "ix_train, ix_test = train_test_split(np.arange(df.shape[0]), test_size=.33, random_state=100)\n",
    "\n",
    "# model 1\n",
    "#    probabilities estimated from the frequency of the feature values\n",
    "df_train, df_test = df.iloc[ix_train, :], df.iloc[ix_test, :]\n",
    "\n",
    "nb1 = fit(df_train)\n",
    "probs_1, predictions_1 = nb1(df_test)\n",
    "probs_1 = pd.DataFrame(probs_1, index=df_test.index)\n",
    "\n",
    "# model 2\n",
    "#    probabilities estimated with a normal distribution over the encoded features\n",
    "ds = pd.DataFrame(loadCsv(\"carData.csv\"), columns=range(7))\n",
    "X_train, y_train = ds.iloc[ix_train, list(range(6))], ds.iloc[ix_train, 6]\n",
    "X_test, y_test = ds.iloc[ix_test, list(range(6))], ds.iloc[ix_test, 6]\n",
    "\n",
    "nb2 = GaussianNB().fit(X_train, y_train)\n",
    "# decode the integer class codes back into the class names\n",
    "predictions_2 = [map_int_into_classe[p] for p in nb2.predict(X_test)]\n",
    "probs_2 = nb2.predict_proba(X_test)\n",
    "probs_2 = pd.DataFrame(probs_2, columns=[map_int_into_classe[v] for v in nb2.classes_], index=df_test.index)\n",
    "\n",
    "# join the predictions of both models with the test rows\n",
    "predictions = pd.DataFrame({'model_1': predictions_1, 'model_2': predictions_2, 'correct': df_test.loc[:, 6]}, index=df_test.index)\n",
    "predictions = pd.merge(df_test, predictions, left_index=True, right_index=True)\n",
    "\n",
    "# ordered categoricals so that tables list the classes from worst to best.\n",
    "# pd.Categorical is used because astype('category', categories=..., ordered=...)\n",
    "# is not accepted by current pandas; the columns are assigned directly so the\n",
    "# categorical dtype replaces the object dtype\n",
    "class_order = ['unacc', 'acc', 'good', 'vgood']\n",
    "for model_col in ['model_1', 'model_2']:\n",
    "    predictions[model_col] = pd.Categorical(predictions[model_col], categories=class_order, ordered=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### O que deu de diferente?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 234,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>model_2</th>\n",
       "      <th>unacc</th>\n",
       "      <th>acc</th>\n",
       "      <th>good</th>\n",
       "      <th>vgood</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>model_1</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>unacc</th>\n",
       "      <td>317.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>acc</th>\n",
       "      <td>69.0</td>\n",
       "      <td>69.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>40.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>good</th>\n",
       "      <td>3.0</td>\n",
       "      <td>13.0</td>\n",
       "      <td>13.0</td>\n",
       "      <td>7.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>vgood</th>\n",
       "      <td>7.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>33.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "model_2  unacc   acc  good  vgood\n",
       "model_1                          \n",
       "unacc    317.0   0.0   0.0    0.0\n",
       "acc       69.0  69.0   0.0   40.0\n",
       "good       3.0  13.0  13.0    7.0\n",
       "vgood      7.0   0.0   0.0   33.0"
      ]
     },
     "execution_count": 234,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# count the test rows for every pair of predicted labels (rows: model_1, columns: model_2)\n",
    "predictions.groupby(['model_1', 'model_2']).size().unstack().fillna(0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Analisando um caso em que model_1 classificou como muito bom e model_2 classificou como muito ruim"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 235,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>correct</th>\n",
       "      <th>model_1</th>\n",
       "      <th>model_2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1466</th>\n",
       "      <td>low</td>\n",
       "      <td>high</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>big</td>\n",
       "      <td>high</td>\n",
       "      <td>unacc</td>\n",
       "      <td>unacc</td>\n",
       "      <td>vgood</td>\n",
       "      <td>unacc</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        0     1  2  3    4     5      6 correct model_1 model_2\n",
       "1466  low  high  4  2  big  high  unacc   unacc   vgood   unacc"
      ]
     },
     "execution_count": 235,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# show one random test row that model_1 labelled 'vgood' and model_2 labelled 'unacc'\n",
    "# (sample() gets no random_state, so the row shown can change between runs)\n",
    "predictions[(predictions.model_1 == 'vgood') & (predictions.model_2 == 'unacc')].sample()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 236,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(acc      0.000003\n",
       " good     0.000033\n",
       " unacc    0.000038\n",
       " vgood    0.000048\n",
       " Name: 1662, dtype: float64, unacc    0.850889\n",
       " acc      0.025733\n",
       " good     0.123378\n",
       " vgood    0.000000\n",
       " Name: 1662, dtype: float64)"
      ]
     },
     "execution_count": 236,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# per-class scores of model_1 for the test row with index label 1662 (hard-coded)\n",
    "probs_1.loc[1662]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 237,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "unacc    0.850889\n",
       "acc      0.025733\n",
       "good     0.123378\n",
       "vgood    0.000000\n",
       "Name: 1662, dtype: float64"
      ]
     },
     "execution_count": 237,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# per-class probabilities of model_2 for the same hard-coded row (index label 1662)\n",
    "probs_2.loc[1662]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 238,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>4</th>\n",
       "      <th>big</th>\n",
       "      <th>med</th>\n",
       "      <th>small</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>vgood</th>\n",
       "      <td>0.680851</td>\n",
       "      <td>0.319149</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>acc</th>\n",
       "      <td>0.375969</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.290698</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>good</th>\n",
       "      <td>0.369565</td>\n",
       "      <td>0.304348</td>\n",
       "      <td>0.326087</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>unacc</th>\n",
       "      <td>0.295285</td>\n",
       "      <td>0.328784</td>\n",
       "      <td>0.375931</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "4           big       med     small\n",
       "6                                  \n",
       "vgood  0.680851  0.319149       NaN\n",
       "acc    0.375969  0.333333  0.290698\n",
       "good   0.369565  0.304348  0.326087\n",
       "unacc  0.295285  0.328784  0.375931"
      ]
     },
     "execution_count": 238,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# having lug_boot = big seems to contribute a lot to the model 1 probability,\n",
    "#    since the frequency of lug_boot = big within class vgood is quite high (68%)\n",
    "df_train.groupby([6, 4]).size().unstack().div(df_train.groupby(6).size(), axis=0).sort_values('big', ascending=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Analisando um caso em que model_2 classificou como muito bom e model_1 classificou como muito ruim"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 240,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>correct</th>\n",
       "      <th>model_1</th>\n",
       "      <th>model_2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [0, 1, 2, 3, 4, 5, 6, correct, model_1, model_2]\n",
       "Index: []"
      ]
     },
     "execution_count": 240,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# test rows that model_2 labelled 'vgood' while model_1 labelled 'unacc'\n",
    "predictions[(predictions.model_2 == 'vgood') & (predictions.model_1 == 'unacc')]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Analisando um caso em que ambos classificaram como muito ruim quando era muito bom"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 242,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>correct</th>\n",
       "      <th>model_1</th>\n",
       "      <th>model_2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [0, 1, 2, 3, 4, 5, 6, correct, model_1, model_2]\n",
       "Index: []"
      ]
     },
     "execution_count": 242,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# test rows that both models labelled 'unacc' although the true class is 'vgood'\n",
    "predictions[(predictions.model_2 == 'unacc') & (predictions.model_1 == 'unacc') & (predictions.correct == \"vgood\")]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Analisando um caso em que ambos classificaram como muito bom quando era muito ruim"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 244,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>correct</th>\n",
       "      <th>model_1</th>\n",
       "      <th>model_2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1682</th>\n",
       "      <td>low</td>\n",
       "      <td>low</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>big</td>\n",
       "      <td>high</td>\n",
       "      <td>unacc</td>\n",
       "      <td>unacc</td>\n",
       "      <td>vgood</td>\n",
       "      <td>vgood</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1601</th>\n",
       "      <td>low</td>\n",
       "      <td>med</td>\n",
       "      <td>5more</td>\n",
       "      <td>2</td>\n",
       "      <td>big</td>\n",
       "      <td>high</td>\n",
       "      <td>unacc</td>\n",
       "      <td>unacc</td>\n",
       "      <td>vgood</td>\n",
       "      <td>vgood</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        0    1      2  3    4     5      6 correct model_1 model_2\n",
       "1682  low  low      4  2  big  high  unacc   unacc   vgood   vgood\n",
       "1601  low  med  5more  2  big  high  unacc   unacc   vgood   vgood"
      ]
     },
     "execution_count": 244,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# test rows that both models labelled 'vgood' although the true class is 'unacc'\n",
    "predictions[(predictions.model_2 == 'vgood') & (predictions.model_1 == 'vgood') & (predictions.correct == \"unacc\")]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 246,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "6\n",
       "acc      258\n",
       "good      46\n",
       "unacc    806\n",
       "vgood     47\n",
       "dtype: int64"
      ]
     },
     "execution_count": 246,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# note that the class vgood has far fewer training examples than the class unacc\n",
    "df_train.groupby(6).size()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
